This ensures that all results are replicable
set.seed(5241988)
Define a function that will take a list of required packages, install any that are missing, and add all packages to the local workspace:
get_packages<-function(package_list){
# Install packages not yet installed
installed_packages <- package_list %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
install.packages(package_list[!installed_packages])
}
# Packages loading
invisible(lapply(package_list, require, character.only = TRUE))}
Apply to the package list for this analysis:
packages<-c('dplyr','tidyr','magrittr','ggplot2','statcomp','bayesplot','rstanarm','bayestestR','BayesFactor','irr','gridExtra','parallel','shinystan')
get_packages(packages)
Set some environmental variables:
options(mc.cores=detectCores())
test_regex="^scale.*|^\\(Int.*"
regex="^\\(Int|^Age.*|^Sex.*|^Birth.*|^s\\(B.*"
my_probs=c(0.0275,0.5,0.975)
Set color scheme and line types for tokens
voiced_col<-RColorBrewer::brewer.pal(4, "Paired")[1]
voiced_col_alt<-RColorBrewer::brewer.pal(4, "Paired")[2]
voiceless_col<-RColorBrewer::brewer.pal(4, "Paired")[3]
voiceless_col_alt<-RColorBrewer::brewer.pal(4, "Paired")[4]
male_col<-RColorBrewer::brewer.pal(4, "Accent")[2]
female_col<-RColorBrewer::brewer.pal(4, "Accent")[3]
female_line<-'solid'
male_line<-'dashed'
Set general theme for plots
theme_set(theme_minimal()+
theme(plot.title = element_text(size=22),
legend.key.size = unit(1,"cm"),
legend.text = element_text(size=12),
legend.title = element_text(size=15),
panel.spacing=unit(2,"lines"),
panel.background=element_rect(color='black'),
strip.text=element_text(color='black',size=18),
axis.text=element_text(color='black',size=12),
axis.title=element_text(color='black',size=15)))
By default, the chunks in this notebook that pertain to the statistical analysis have been set not to evaluate. This will load all models to the workspace so they can be queried without having to wait for them to run.
load('../Data/all_models_final.RData')
By default, we’ll assume that the script hasn’t been moved from its
original location and that all data files are in the same directory as
this script. If that’s not true, use setwd() to change the
working directory first.
Load in test set and process
test_set=read.delim('../Data/testset_results.txt')
process_test_set=function(data){
data$Speaker=with(data,factor(ifelse(substr(File,1,1)=='A',"A","G")))
data$File=as.factor(data$File)
data$Phone=as.factor(data$Phone)
data$Word=as.factor(data$Word)
data$Mean_HNR=as.numeric(data$Mean_HNR)
return(data)}
test_set %<>% process_test_set()
Divide into voiceless and voiced
voiced_test_set<-test_set %>% filter(.,Phone=='DH'|Phone=='D') %>% droplevels()
voiceless_test_set<-test_set %>% filter(.,Phone=="TH"|Phone=="T") %>% droplevels()
voiced_test_set$Phone<-relevel(voiced_test_set$Phone,ref="DH")
voiceless_test_set$Phone<-relevel(voiceless_test_set$Phone,ref="TH")
phones<-rev(c('/θ/','/t/','/ð/','/d/'))
cog<-ggplot(test_set,aes(y=Phone,x=CoG,fill=Phone))+geom_violin()+stat_summary(geom='errorbar',width=0.25,fun.data='mean_cl_boot')+scale_fill_brewer(palette='Paired')+scale_y_discrete(labels=phones)+coord_cartesian(xlim=c(0,1500))+theme(legend.position='none')+labs(x="Center of Gravity")
skew<-ggplot(test_set,aes(x=Skewness,y=Phone,fill=Phone))+geom_violin()+stat_summary(geom='errorbar',width=0.25,fun.data='mean_cl_boot')+scale_fill_brewer(palette='Paired')+scale_y_discrete(labels=phones)+coord_cartesian(xlim=c(0,25))+theme(legend.position='none')
kur<-ggplot(test_set,aes(x=Kurtosis,y=Phone,fill=Phone))+geom_violin()+stat_summary(geom='errorbar',width=0.25,fun.data='mean_cl_boot')+scale_fill_brewer(palette='Paired')+scale_y_discrete(labels=phones)+coord_cartesian(xlim=c(0,500))+theme(legend.position='none')
hnr<-ggplot(test_set,aes(x=Mean_HNR,y=Phone,fill=Phone))+geom_violin()+stat_summary(geom='errorbar',width=0.25,fun.data='mean_cl_boot')+scale_fill_brewer(palette='Paired')+scale_y_discrete(labels=phones)+theme(legend.position='none')+labs(x="Mean HNR")
grid.arrange(cog,hnr,skew,kur)->figure1
ms_testset=stan_glmer(Phone~scale(Mean_HNR)+scale(CoG)+scale(Skewness)+scale(Kurtosis)+(1|Speaker),voiceless_test_set,family='binomial',adapt_delta=0.99,warmup=2000,iter=6000,chains=8)
ms_testset_loo<-loo(ms_testset,k_threshold=0.7)
mv_testset=stan_glmer(Phone~scale(Mean_HNR)+scale(CoG)+scale(Skewness)+scale(Kurtosis)+(1|Speaker),voiced_test_set,family='binomial',adapt_delta=0.99,warmup=2000,iter=6000,chains=8)
mv_testset_loo<-loo(mv_testset,k_threshold=0.7)
summary(ms_testset,regex_pars=test_regex,probs=my_probs,digits=3)
Model Info:
function: stan_glmer
family: binomial [logit]
formula: Phone ~ scale(Mean_HNR) + scale(CoG) + scale(Skewness) + scale(Kurtosis) +
(1 | Speaker)
algorithm: sampling
sample: 32000 (posterior sample size)
priors: see help('prior_summary')
observations: 334
groups: Speaker (2)
Estimates:
mean sd 2.75% 50% 97.5%
(Intercept) 0.510 0.507 -0.545 0.529 1.508
scale(Mean_HNR) -0.035 0.173 -0.369 -0.036 0.307
scale(CoG) -0.821 0.203 -1.221 -0.815 -0.439
scale(Skewness) -1.888 0.391 -2.653 -1.883 -1.138
scale(Kurtosis) 0.939 0.343 0.282 0.939 1.619
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.012 1.003 1861
scale(Mean_HNR) 0.001 1.000 26973
scale(CoG) 0.002 1.000 18019
scale(Skewness) 0.003 1.000 15284
scale(Kurtosis) 0.003 1.000 16051
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
summary(mv_testset,regex_pars=test_regex,probs=my_probs,digits=3)
Model Info:
function: stan_glmer
family: binomial [logit]
formula: Phone ~ scale(Mean_HNR) + scale(CoG) + scale(Skewness) + scale(Kurtosis) +
(1 | Speaker)
algorithm: sampling
sample: 32000 (posterior sample size)
priors: see help('prior_summary')
observations: 264
groups: Speaker (2)
Estimates:
mean sd 2.75% 50% 97.5%
(Intercept) 0.664 0.870 -1.172 0.690 2.419
scale(Mean_HNR) -1.133 0.285 -1.697 -1.129 -0.582
scale(CoG) -0.273 0.244 -0.746 -0.271 0.203
scale(Skewness) 1.406 0.482 0.504 1.399 2.370
scale(Kurtosis) -1.436 0.431 -2.286 -1.429 -0.613
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.008 1.000 10948
scale(Mean_HNR) 0.002 1.000 23745
scale(CoG) 0.002 1.000 18239
scale(Skewness) 0.004 1.000 15339
scale(Kurtosis) 0.003 1.000 16371
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
print("Voiceless Data:")
[1] "Voiceless Data:"
hdi(ms_testset)
Highest Density Interval
Parameter | 95% HDI
--------------------------------
(Intercept) | [-0.55, 1.55]
scale(Mean_HNR) | [-0.38, 0.30]
scale(CoG) | [-1.22, -0.43]
scale(Skewness) | [-2.65, -1.12]
scale(Kurtosis) | [ 0.27, 1.62]
bayesfactor(ms_testset)
Sampling priors, please wait...
Warning: Bayes factors might not be precise.
For precise Bayes factors, sampling at least 40,000 posterior samples is recommended.
Bayes Factor (Savage-Dickey density ratio)
Parameter | BF
--------------------------
(Intercept) | 0.770
scale(Mean_HNR) | 0.070
scale(CoG) | 429.50
scale(Skewness) | 3.46e+03
scale(Kurtosis) | 6.16
* Evidence Against The Null: 0
print("Voiced Data:")
[1] "Voiced Data:"
hdi(mv_testset)
Highest Density Interval
Parameter | 95% HDI
--------------------------------
(Intercept) | [-1.10, 2.54]
scale(Mean_HNR) | [-1.69, -0.56]
scale(CoG) | [-0.76, 0.20]
scale(Skewness) | [ 0.50, 2.38]
scale(Kurtosis) | [-2.29, -0.60]
bayesfactor(mv_testset)
Sampling priors, please wait...
Warning: Bayes factors might not be precise.
For precise Bayes factors, sampling at least 40,000 posterior samples is recommended.
Bayes Factor (Savage-Dickey density ratio)
Parameter | BF
------------------------
(Intercept) | 0.567
scale(Mean_HNR) | 273.53
scale(CoG) | 0.183
scale(Skewness) | 17.11
scale(Kurtosis) | 61.60
* Evidence Against The Null: 0
color_scheme_set(c(voiceless_col,rep("#000000",5)))
ms_testset_post<-mcmc_areas(ms_testset,regex_pars=test_regex,prob_outer=0.95)+ggtitle("Voiceless")+scale_y_discrete(labels=c("Intercept","Mean HNR","CoG","Skewness","Kurtosis"))+geom_vline(xintercept=0,linetype='dotted')+labs(x="Stop Likelihood")
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
color_scheme_set(c(voiced_col,rep("#000000",5)))
mv_testset_post<-mcmc_areas(mv_testset,regex_pars=test_regex,prob_outer=0.95)+ggtitle("Voiced")+scale_y_discrete(labels=c("Intercept","Mean HNR","CoG","Skewness","Kurtosis"))+geom_vline(xintercept=0,linetype='dotted')+labs(x="Stop Likelihood")
Scale for y is already present.
Adding another scale for y, which will replace the existing scale.
grid.arrange(ms_testset_post,mv_testset_post,ncol=2)->figure2
ms_testset_rope<-plot(rope(ms_testset,range=rope_range(ms_testset),ci_method='hdi'),alpha=0.75)+ggtitle("Voiceless")+scale_fill_manual(values=c(voiceless_col,voiceless_col_alt))
Possible multicollinearity between scale(Kurtosis) and scale(Skewness) (r = 0.87). This might lead to inappropriate results. See 'Details' in '?rope'.
mv_testset_rope<-plot(rope(mv_testset,range=rope_range(mv_testset),ci_method='hdi'),alpha=0.75)+ggtitle("Voiced")+scale_fill_manual(values=c(voiced_col,voiced_col_alt))
Possible multicollinearity between scale(Kurtosis) and scale(Skewness) (r = 0.88). This might lead to inappropriate results. See 'Details' in '?rope'.
grid.arrange(ms_testset_rope,mv_testset_rope,ncol=1)
launch_shinystan(ms_testset)
launch_shinystan(mv_testset)
PREP = Puerto Rican English in Philadelphia Corpus
By default, we’ll assume that the script hasn’t been moved from its
original location and that all data files are in the same directory as
this script. If that’s not true, use setwd() to change the
working directory first.
prep_data=read.delim('../Data/prep_data.txt')
socio_data=read.csv('../Data/prep_socio.csv')
perceptual_data=read.delim('../Data/perceptual_data.txt')
perceptual_tokens_voicing_codes=read.delim('../Data/voicing_codes.txt')
Start by getting phonetic context
prep_data$FollowingStress=as.factor(with(prep_data,substr(FollowingPhone,nchar(FollowingPhone),nchar(FollowingPhone))))
prep_data$FollowingVowel=as.factor(with(prep_data,substr(FollowingPhone,1,nchar(FollowingPhone)-1)))
prep_data$FollowingPhone=as.factor(prep_data$FollowingPhone)
Remove tokens that aren’t prevocalic (i.e., are consonant clusters)
prep_data %<>% filter(!(FollowingPhone %in% list('R','W','Y')))
prep_data %<>% mutate(Voicing=if_else(Phone %in% list('D','DH'),'Voiced','Voiceless'),Manner=if_else(Phone %in% list('D','T'),'Stop','Fricative'),Metric=if_else(Voicing=='Voiced','Mean_HNR','Skewness'))
Join to sociodemographic data and specify data structure
prep_data %>% left_join(.,socio_data, by=c('File'='Speaker'))->prep_socio # Merge socio data with speaker data
prep_socio$Sex=as.factor(prep_socio$Sex)
prep_socio$Mean_HNR=as.numeric(prep_socio$Mean_HNR)
prep_socio$File=as.factor(prep_socio$File)
prep_socio$Word=gsub("\\(\\(","",prep_socio$Word)
prep_socio$Word=gsub("\\)\\)","",prep_socio$Word)
prep_socio$Word=as.factor(prep_socio$Word)
prep_socio$Phone=as.factor(prep_socio$Phone)
prep_socio %<>% mutate(AgeGroup=if_else(BirthYear>=1985,'Younger','Older'),Speaker=File)
Get average values by speaker, voicing, and phonetic context
prep_data$PrecedingPhone=as.factor(prep_data$PrecedingPhone)
prep_data$PrecedingWord=as.factor(prep_data$PrecedingWord)
prep_data$Skewness=as.numeric(prep_data$Skewness)
prep_data$Mean_HNR=as.numeric(prep_data$Mean_HNR)
speaker_avgs<-prep_data %>% filter(Phone=='D' | Phone=='T') %>%
group_by(File, Phone, FollowingVowel, FollowingStress) %>%
reframe(count=n(),HNR_stop=mean(na.omit(Mean_HNR)),sd_HNR_stop=sd(na.omit(Mean_HNR)), skewness_stop=mean(na.omit(Skewness)),sd_skewness_stop=sd(na.omit(Skewness))) %>%
ungroup()
voiced_avgs<-speaker_avgs %>% filter(Phone=='D') %>% select(-Phone)
voiceless_avgs<-speaker_avgs %>% filter(Phone=='T') %>% select(-Phone)
Functions to calculate normalized stopping ratios and remove outliers >3 standard deviations from the group mean, by speaker.
make_stopping_ratios<-function(data,p){
data %<>% filter(Phone==p) %>%
left_join(voiced_avgs,by=c('File','FollowingVowel','FollowingStress')) %>%
mutate(HNR_ratio=Mean_HNR/HNR_stop,skewness_ratio=Skewness/skewness_stop) %>%
mutate(ratio=(if_else(Phone=='TH',skewness_ratio,if_else(Phone=="DH",HNR_ratio,NA)))) %>%
left_join(socio_data,by=c('File'='Speaker')) %>% rename(Speaker=File) %>%
mutate(AgeGroup=as.factor(ifelse(BirthYear>=1985,'Younger','Older')))
return(data)
}
remove_outliers<-function(data){
data %>% filter(!is.na(ratio)) %>% mutate(Pseudonym=ifelse(.$Speaker=='S39','Carina',.$Pseudonym))%>%
mutate(mean_ratio_byspeaker=mean(ratio),sd_ratio_byspeaker=sd(ratio)) %>%
ungroup() %>%
filter(abs(ratio-mean_ratio_byspeaker)<=3*sd_ratio_byspeaker) %>%
filter(abs(ratio-mean(ratio))<=3*sd(ratio))->result
print(paste0("Removed ",nrow(data)-nrow(result)," rows, or ",round(100*(1-nrow(result)/nrow(data)),2),"% of data."))
return(result)
}
Apply to data and create production dataset
voiceless<-make_stopping_ratios(prep_data,"TH") %>% remove_outliers()
[1] "Removed 76 rows, or 18.01% of data."
voiced<-make_stopping_ratios(prep_data,"DH") %>% remove_outliers()
[1] "Removed 1163 rows, or 23.34% of data."
production_data<-rbind(voiced,voiceless)
production_data %>% group_by(Word,Phone) %>% summarise(count=n()) %>% arrange(desc(count)) %>% {.->> word_cts} %>%
ggplot(.,aes(x=count))+geom_density()+scale_x_continuous(breaks=seq(0,500,by=50))+coord_cartesian(xlim=c(0,500))+ggtitle("Word Distribution")
`summarise()` has grouped output by 'Word'. You can override using the `.groups` argument.
Note that only 2 of the top 10 words are /θ/ words (think,things)
word_cts %>%
filter(count>=40&(Phone=='DH'|Phone=='TH')) %>%
arrange(desc(count)) %>%
head(20)
Top 10 /ð/ words
word_cts %>%
filter(Phone=='DH') %>%
arrange(desc(count)) %>%
head(10)
Top 10 /θ/ words
word_cts %>%
filter(Phone=='TH') %>%
arrange(desc(count)) %>%
head(10)
Visualize distribution of fricatives and stops across data by metric
ggplot(prep_socio,aes(x=if_else(Voicing=='Voiced',Mean_HNR,Skewness),fill=Manner))+
geom_density(alpha=0.5)+facet_grid(Voicing~.)+
labs(x="Metric",y="Density")+scale_fill_viridis_d(option='cividis',begin=0.3,end=0.9)+theme(legend.position='bottom')
ggplot(prep_socio,aes(x=if_else(Voicing=='Voiced',Mean_HNR,Skewness),fill=Manner))+
geom_density(alpha=0.5)+facet_grid(Voicing~Sex)+
labs(x="Metric",y="Density")+scale_fill_viridis_d(option='viridis',begin=0.2,end=0.6)+theme(legend.position='bottom')
ggplot(prep_socio,aes(x=if_else(Voicing=='Voiced',Mean_HNR,Skewness),fill=Manner))+
geom_density(alpha=0.5)+facet_grid(Sex~AgeGroup+Voicing)+
labs(x="Metric",y="Density")+scale_fill_viridis_d(option='inferno')+theme(legend.position='bottom')
Individual ranges to see if there are any major outliers that remain after trimming procedure
ggplot(voiceless,aes(x=Pseudonym,y=ratio))+
geom_boxplot(fill=voiceless_col)+
labs(x="Speaker",y="Normalized Stopping Ratio\n(Skewness)")+
ggtitle("Voiceless")+scale_y_continuous(breaks=seq(-1,5,by=0.5))+geom_hline(yintercept=1,linetype='dashed')+
coord_flip()->p_voiceless_byspeaker
ggplot(voiced,aes(x=Pseudonym,y=ratio))+
geom_boxplot(fill=voiced_col)+
labs(x="Speaker",y="Normalized Stopping Ratio\n(Mean HNR)")+
scale_y_continuous(breaks=seq(-1,5,by=1))+
ggtitle("Voiced")+geom_hline(yintercept=1,linetype='dashed')+
coord_flip()->p_voiced_byspeaker
grid.arrange(p_voiceless_byspeaker,p_voiced_byspeaker,ncol=2)
View general trends by birth year
age_voiceless<-ggplot(voiceless,aes(x=BirthYear,y=skewness_ratio-1,linetype=Sex))+
geom_hline(yintercept=0,linewidth=0.5,linetype='dotted')+geom_rug(sides="br",linetype='solid')+
geom_smooth(aes(group=Sex),color='black',fill=voiceless_col,method='gam',formula=y~s(x,k=3,m=2))+
scale_fill_manual(values=c(male_col,female_col))+
scale_y_continuous(breaks=seq(-1.5,1.5,by=0.5))+geom_point(alpha=0.05,position=position_jitter(0.3))+
labs(x="Birth Year",y="Skewness Ratio\n(Deviance from 1)")+coord_cartesian(ylim=c(-1.5,1.5))+
ggtitle("Voiceless")+theme(legend.position='bottom',strip.text=element_text(size=15))+scale_x_continuous(expand = expansion(mult = 0.1))
age_voiced<-ggplot(voiced,aes(x=BirthYear,y=HNR_ratio-1,linetype=Sex))+
geom_hline(yintercept=0,linewidth=0.5,linetype='dotted')+geom_rug(sides="br",linetype='solid')+
geom_smooth(aes(group=Sex),color='black',fill=voiced_col,method='gam',formula=y~s(x,k=3,m=2))+
scale_y_continuous(breaks=seq(-1.5,1.5,by=0.5))+geom_point(alpha=0.05,position=position_jitter(0.3))+
labs(x="Birth Year",y="HNR Ratio\n(Deviance from 1)")+
ggtitle("Voiced")+coord_cartesian(ylim=c(-1.5,1.5))+
theme(legend.position='bottom',strip.text=element_text(size=15))+scale_x_continuous(expand = expansion(mult = 0.1))
grid.arrange(age_voiceless,age_voiced,ncol=2)->figure4
print(figure4)
TableGrob (1 x 2) "arrange": 2 grobs
View trends by birth year, grouping by speaker
age_voiced_byspeaker<-ggplot(voiced,aes(x=BirthYear,y=HNR_ratio))+
geom_hline(yintercept=1,linewidth=0.5,linetype='dotted')+
geom_boxplot(aes(group=Speaker),fill=voiced_col)+
labs(x="Birth Year",y="Normalized HNR Ratio")+ggtitle("Voiced")+
theme(legend.position='bottom')+scale_x_continuous(expand = expansion(mult = 0.1))+
facet_wrap(~Sex,ncol=1)
age_voiceless_byspeaker<-ggplot(voiceless,aes(x=BirthYear,y=skewness_ratio))+
geom_hline(yintercept=1,linewidth=0.5,linetype='dotted')+
geom_boxplot(aes(group=Speaker),fill=voiceless_col)+
labs(x="Birth Year",y="Normalized Skewness Ratio")+ggtitle("Voiceless")+
theme(legend.position='bottom')+
scale_x_continuous(expand = expansion(mult = 0.1))+
facet_wrap(~Sex,ncol=1)
grid.arrange(age_voiceless_byspeaker,age_voiced_byspeaker,ncol=2)
Set Contrasts
set_contrasts<-function(data){
data$Sex<-as.factor(data$Sex)
data$Sex<-relevel(data$Sex,ref='Male')
data$AgeGroup=as.factor(data$AgeGroup)
data$AgeGroup<-relevel(data$AgeGroup,ref="Older")
return(data)
}
voiced %<>% set_contrasts()
voiceless %<>% set_contrasts()
voiced$HNR_ratio_deviance=voiced$HNR_ratio-1
voiceless$skewness_ratio_deviance=voiceless$skewness_ratio-1
ms_prep_gamm<-stan_gamm4(skewness_ratio_deviance~Sex*BirthYear+s(BirthYear,bs='tp',by=Sex,k=3,m=2),random=~(1|Speaker),data=voiceless,adapt_delta=0.999,warmup=2000,iter=6000,chains=8)
ms_prep_gamm_loo<-loo(ms_prep_gamm,k_threshold=0.7)
mv_prep_gamm<-stan_gamm4(HNR_ratio_deviance~Sex*BirthYear+s(BirthYear,bs='tp',by=Sex,k=3,m=2),random=~(1|Speaker),data=voiced,adapt_delta=0.999,warmup=2000,iter=6000,chains=8)
mv_prep_gamm_loo<-loo(mv_prep_gamm,k_threshold=0.7)
summary(ms_prep_gamm,regex_pars=regex,probs=my_probs,digits=3)
Model Info:
function: stan_gamm4
family: gaussian [identity]
formula: skewness_ratio_deviance ~ Sex * BirthYear + s(BirthYear, bs = "tp",
by = Sex, k = 3, m = 2)
algorithm: sampling
sample: 32000 (posterior sample size)
priors: see help('prior_summary')
observations: 346
groups: Speaker (32)
Estimates:
mean sd 2.75% 50% 97.5%
(Intercept) 10.825 28.095 -43.075 9.385 68.913
SexFemale -0.093 1.775 -3.491 -0.106 3.409
BirthYear -0.005 0.014 -0.034 -0.005 0.023
SexFemale:BirthYear 0.000 0.001 -0.002 0.000 0.002
s(BirthYear):SexMale.1 -0.023 0.231 -0.532 -0.006 0.471
s(BirthYear):SexMale.2 -0.072 0.187 -0.465 -0.046 0.288
s(BirthYear):SexFemale.1 -0.208 0.208 -0.655 -0.179 0.107
s(BirthYear):SexFemale.2 0.063 0.185 -0.291 0.040 0.460
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.280 1.001 10047
SexFemale 0.012 1.000 20838
BirthYear 0.000 1.001 10046
SexFemale:BirthYear 0.000 1.000 20858
s(BirthYear):SexMale.1 0.002 1.000 20402
s(BirthYear):SexMale.2 0.002 1.001 10350
s(BirthYear):SexFemale.1 0.002 1.001 10682
s(BirthYear):SexFemale.2 0.002 1.001 10220
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
summary(mv_prep_gamm,regex_pars=regex,probs=my_probs,digits=3)
Model Info:
function: stan_gamm4
family: gaussian [identity]
formula: HNR_ratio_deviance ~ Sex * BirthYear + s(BirthYear, bs = "tp",
by = Sex, k = 3, m = 2)
algorithm: sampling
sample: 32000 (posterior sample size)
priors: see help('prior_summary')
observations: 3819
groups: Speaker (32)
Estimates:
mean sd 2.75% 50% 97.5%
(Intercept) -6.346 28.280 -64.967 -5.238 53.315
SexFemale 0.047 1.970 -3.695 0.032 3.921
BirthYear 0.003 0.014 -0.026 0.003 0.034
SexFemale:BirthYear 0.000 0.001 -0.002 0.000 0.002
s(BirthYear):SexMale.1 0.161 0.237 -0.209 0.110 0.717
s(BirthYear):SexMale.2 0.057 0.194 -0.330 0.038 0.475
s(BirthYear):SexFemale.1 0.094 0.134 -0.123 0.070 0.399
s(BirthYear):SexFemale.2 -0.057 0.195 -0.465 -0.039 0.352
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.282 1.000 10059
SexFemale 0.013 1.000 21484
BirthYear 0.000 1.000 10058
SexFemale:BirthYear 0.000 1.000 21443
s(BirthYear):SexMale.1 0.002 1.000 12413
s(BirthYear):SexMale.2 0.002 1.000 10323
s(BirthYear):SexFemale.1 0.001 1.001 8250
s(BirthYear):SexFemale.2 0.002 1.001 10171
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
hdi(ms_prep_gamm)
Highest Density Interval
Parameter | 95% HDI
-------------------------------------
(Intercept) | [-45.07, 68.84]
SexFemale | [ -3.53, 3.45]
BirthYear | [ -0.03, 0.02]
SexFemale:BirthYear | [ -0.00, 0.00]
# Fixed effects smooth_sd
Parameter | Function | 95% HDI
----------------------------------------------------
s(BirthYear):SexMale1 | smooth | [ 0.00, 0.92]
s(BirthYear):SexMale2 | smooth | [ 0.00, 0.86]
s(BirthYear):SexFemale1 | smooth | [ 0.00, 1.01]
s(BirthYear):SexFemale2 | smooth | [ 0.00, 0.85]
bayesfactor(ms_prep_gamm)
Sampling priors, please wait...
Warning: Bayes factors might not be precise.
For precise Bayes factors, sampling at least 40,000 posterior samples is recommended.
Bayes Factor (Savage-Dickey density ratio)
Parameter | BF
---------------------------
(Intercept) | 0.119
SexFemale | 0.707
BirthYear | 0.119
SexFemale:BirthYear | 0.692
* Evidence Against The Null: 0
hdi(mv_prep_gamm)
Highest Density Interval
Parameter | 95% HDI
-------------------------------------
(Intercept) | [-66.83, 53.56]
SexFemale | [ -3.70, 4.00]
BirthYear | [ -0.03, 0.03]
SexFemale:BirthYear | [ -0.00, 0.00]
# Fixed effects smooth_sd
Parameter | Function | 95% HDI
----------------------------------------------------
s(BirthYear):SexMale1 | smooth | [ 0.00, 1.09]
s(BirthYear):SexMale2 | smooth | [ 0.00, 0.96]
s(BirthYear):SexFemale1 | smooth | [ 0.00, 0.93]
s(BirthYear):SexFemale2 | smooth | [ 0.00, 0.97]
bayesfactor(mv_prep_gamm)
Sampling priors, please wait...
Warning: Bayes factors might not be precise.
For precise Bayes factors, sampling at least 40,000 posterior samples is recommended.
Bayes Factor (Savage-Dickey density ratio)
Parameter | BF
---------------------------
(Intercept) | 0.094
SexFemale | 0.702
BirthYear | 0.094
SexFemale:BirthYear | 0.694
* Evidence Against The Null: 0
color_scheme_set(c(voiceless_col,rep("#000000",5)))
ms_prep_gamm_post<-mcmc_areas(ms_prep_gamm,regex_pars=regex,prob_outer=0.95)+ggtitle("Voiceless")
color_scheme_set(c(voiced_col,rep("#000000",5)))
mv_prep_gamm_post<-mcmc_areas(mv_prep_gamm,regex_pars=regex,prob_outer=0.95)+ggtitle("Voiced")
grid.arrange(ms_prep_gamm_post,mv_prep_gamm_post,ncol=2)
Nonlinear Plot
color_scheme_set(c(voiceless_col,rep("#000000",5)))
ms_prep_gamm_nl<-plot_nonlinear(ms_prep_gamm,prob=0.95)+ggtitle("Voiceless Data")
color_scheme_set(c(voiced_col,rep("#000000",5)))
mv_prep_gamm_nl<-plot_nonlinear(mv_prep_gamm,prob=0.95)+ggtitle("Voiced Data")
grid.arrange(ms_prep_gamm_nl,mv_prep_gamm_nl,ncol=1)
ms_prep_gamm_rope<-plot(rope(ms_prep_gamm,range=rope_range(ms_prep_gamm),ci_method='hdi'))+ggtitle("Voiceless")+scale_fill_manual(values=c(voiceless_col,voiceless_col_alt))+theme(legend.position='bottom')
ms_prep_gamm_rope
mv_prep_gamm_rope<-plot(rope(mv_prep_gamm,range=rope_range(mv_prep_gamm),ci_method='hdi'))+ggtitle("Voiced")+scale_fill_manual(values=c(voiced_col,voiced_col_alt))+theme(legend.position='bottom')
mv_prep_gamm_rope
Voiceless Data
launch_shinystan(ms_prep_gamm)
Voiced Data
launch_shinystan(mv_prep_gamm)
perceptual_data %>% filter(Coder!="") %>%
mutate(token=paste(Speaker,Word,Start,End,sep="_")) %>%
left_join(.,perceptual_tokens_voicing_codes,by='Word') %>%
left_join(.,socio_data,by='Speaker') %>%
mutate(AgeGroup=ifelse(as.numeric(BirthYear)>=1985,'Younger','Older')) %>%
filter(Code!="Unsure")->perceptual_dataset_raw
perceptual_dataset_raw %<>% set_contrasts()
perceptual_dataset_raw %>%
group_by(Speaker,AgeGroup,BirthYear,Sex,Voicing,Code) %>%
tally() %>% pivot_wider(names_from=Code,values_from=n) %>%
mutate(Total=Stop+Fricative,Stopping_Rate=100*Stop/(Total)) %>%
{.->> perceptual_rates_raw} %>% print()
perceptual_data %>% filter(Coder!="") %>%
mutate(token=paste(Speaker,Word,Start,End,sep="_")) %>%
left_join(.,perceptual_tokens_voicing_codes,by='Word') %>%
left_join(.,socio_data,by='Speaker') %>%
mutate(AgeGroup=ifelse(as.numeric(BirthYear)>=1985,'Younger','Older'),dh_value=case_match(Code,"Unsure"~1,"Fricative"~0,"Stop"~2)) %>% {.->>perceptual_group_level} %>%
group_by(AgeGroup,Sex) %>% reframe(dh_index=100*mean(dh_value))
perceptual_group_level %>%
group_by(Voicing,AgeGroup,Sex,Code) %>%
filter(Code!="Unsure") %>%
tally() %>% group_by(AgeGroup,Sex,Voicing) %>%
pivot_wider(names_from=Code,values_from=n) %>%
mutate(Total=Stop+Fricative,Stopping_Rate=100*Stop/(Total))
perceptual_group_level %>% filter(Coder!="") %>% group_by(Sex,AgeGroup) %>% reframe(count=n_distinct(Speaker))
Get Fleiss’ Kappa for IRR
perceptual_data %>% filter(Coder!="") %>%
mutate(token=paste(Speaker,Word,Start,End,sep="_")) %>%
select(token,Coder,Code) %>%
pivot_wider(names_from=`Coder`,values_from=Code) %>% {. ->> perceptual_irr} %>%
kappam.fleiss()
Fleiss' Kappa for m Raters
Subjects = 1020
Raters = 23
Kappa = 0.102
z = 75.9
p-value = 0
Show tokens by level of agreement
perceptual_irr %>% rowwise %>%
mutate(distinct = n_distinct(unlist(across(starts_with("A"))))) %>%
ungroup() -> perceptual_irr_agreement
perceptual_irr_agreement
Stopping Rates by Sex and Birth Year based on Individual Rater Score
perceptual_voiceless_rates_raw<- perceptual_rates_raw %>%
filter(Voicing=="Voiceless") %>%
ggplot(aes(x=BirthYear,y=Stopping_Rate,linetype=Sex))+
geom_rug(sides="br",linetype='solid')+
geom_smooth(aes(group=Sex),color='black',fill=voiceless_col,method='gam',formula=y~s(x,k=3,m=2))+
#scale_y_continuous(breaks=seq(-1.5,1.5,by=0.5))+
geom_point(alpha=0.5,position=position_jitter(0.3))+
labs(x="Birth Year",y="Stopping Rate\n(Perceptual, by Rater)")+
ggtitle("Voiceless")+scale_linetype_manual(values=c(male_line,female_line))+
theme(legend.position='bottom',strip.text=element_text(size=15))+scale_x_continuous(expand = expansion(mult = 0.1))
perceptual_voiced_rates_raw<- perceptual_rates_raw %>%
filter(Voicing=="Voiced") %>%
ggplot(aes(x=BirthYear,y=Stopping_Rate,linetype=Sex))+
geom_rug(sides="br",linetype='solid')+
geom_smooth(aes(group=Sex),color='black',fill=voiced_col,method='gam',formula=y~s(x,k=3,m=2))+
geom_point(alpha=0.5,position=position_jitter(0.3,0.3))+
labs(x="Birth Year",y="Stopping Rate\n(Perceptual, by Rater)")+
ggtitle("Voiced")+scale_linetype_manual(values=c(male_line,female_line))+
theme(legend.position='bottom',strip.text=element_text(size=15))+scale_x_continuous(expand = expansion(mult = 0.1))
grid.arrange(perceptual_voiceless_rates_raw,perceptual_voiced_rates_raw,ncol=2)
Force outcome variable to be factor for binomial gam
perceptual_dataset_raw$Code %<>% as.factor()
ms_per_raw_gamm<-stan_gamm4(Code~BirthYear*Sex+s(BirthYear,by=Sex,k=3,m=2,bs='tp'),random=~(1|Speaker),data=subset(perceptual_dataset_raw,Voicing=="Voiceless"),family='binomial',adapt_delta=0.999,warmup=2000,iter=6000,chains=8)
ms_per_raw_gamm_loo<-loo(ms_per_raw_gamm)
mv_per_raw_gamm<-stan_gamm4(Code~BirthYear*Sex+s(BirthYear,by=Sex,k=3,m=2,bs='tp'),random=~(1|Speaker),data=subset(perceptual_dataset_raw,Voicing=="Voiced"),family='binomial',adapt_delta=0.999,warmup=2000,iter=6000,chains=8)
mv_per_raw_gamm_loo<-loo(mv_per_raw_gamm,k_threshold=0.7)
summary(ms_per_raw_gamm,probs=my_probs,regex_pars=regex,digits=3)
Model Info:
function: stan_gamm4
family: binomial [logit]
formula: Code ~ BirthYear * Sex + s(BirthYear, by = Sex, k = 3, m = 2,
bs = "tp")
algorithm: sampling
sample: 32000 (posterior sample size)
priors: see help('prior_summary')
observations: 7848
groups: Speaker (32)
Estimates:
mean sd 2.75% 50% 97.5%
(Intercept) 0.473 62.158 -123.727 1.576 130.864
BirthYear -0.001 0.031 -0.064 -0.002 0.064
SexFemale 0.024 3.733 -7.133 0.031 7.313
BirthYear:SexFemale 0.000 0.002 -0.004 0.000 0.004
s(BirthYear):SexMale.1 -0.186 0.564 -1.526 -0.071 0.842
s(BirthYear):SexMale.2 0.066 0.403 -0.716 0.031 0.958
s(BirthYear):SexFemale.1 -0.435 0.459 -1.427 -0.366 0.267
s(BirthYear):SexFemale.2 -0.058 0.404 -0.883 -0.031 0.791
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.627 1.001 9827
BirthYear 0.000 1.001 9826
SexFemale 0.027 1.000 19162
BirthYear:SexFemale 0.000 1.000 19182
s(BirthYear):SexMale.1 0.005 1.000 15593
s(BirthYear):SexMale.2 0.004 1.001 10293
s(BirthYear):SexFemale.1 0.005 1.001 8224
s(BirthYear):SexFemale.2 0.004 1.001 9740
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
summary(mv_per_raw_gamm,probs=my_probs,regex_pars=regex,digits=3)
Model Info:
function: stan_gamm4
family: binomial [logit]
formula: Code ~ BirthYear * Sex + s(BirthYear, by = Sex, k = 3, m = 2,
bs = "tp")
algorithm: sampling
sample: 32000 (posterior sample size)
priors: see help('prior_summary')
observations: 12113
groups: Speaker (32)
Estimates:
mean sd 2.75% 50% 97.5%
(Intercept) -3.489 65.091 -132.824 -0.730 130.248
BirthYear 0.001 0.033 -0.064 0.000 0.070
SexFemale 0.047 3.711 -7.117 0.052 7.330
BirthYear:SexFemale 0.000 0.002 -0.004 0.000 0.004
s(BirthYear):SexMale.1 0.044 0.395 -0.746 0.014 0.937
s(BirthYear):SexMale.2 0.117 0.405 -0.667 0.083 0.971
s(BirthYear):SexFemale.1 -0.264 0.295 -0.892 -0.224 0.211
s(BirthYear):SexFemale.2 -0.119 0.407 -0.942 -0.084 0.710
MCMC diagnostics
mcse Rhat n_eff
(Intercept) 0.719 1.001 8196
BirthYear 0.000 1.001 8195
SexFemale 0.029 1.000 16582
BirthYear:SexFemale 0.000 1.000 16602
s(BirthYear):SexMale.1 0.003 1.000 12795
s(BirthYear):SexMale.2 0.004 1.001 8416
s(BirthYear):SexFemale.1 0.004 1.001 7048
s(BirthYear):SexFemale.2 0.005 1.001 8160
For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
color_scheme_set(c(voiceless_col,rep("#000000",5)))
ms_per_raw_gamm_nl<-plot_nonlinear(ms_per_raw_gamm,prob=0.95)+ggtitle("Voiceless Data")
color_scheme_set(c(voiced_col,rep("#000000",5)))
mv_per_raw_gamm_nl<-plot_nonlinear(mv_per_raw_gamm,prob=0.95)+ggtitle("Voiced Data")
grid.arrange(ms_per_raw_gamm_nl,mv_per_raw_gamm_nl,ncol=1)
hdi(ms_per_raw_gamm)
Highest Density Interval
Parameter | 95% HDI
---------------------------------------
(Intercept) | [-126.58, 132.76]
BirthYear | [ -0.07, 0.06]
SexFemale | [ -7.16, 7.45]
BirthYear:SexFemale | [ -0.00, 0.00]
# Fixed effects smooth_sd
Parameter | Function | 95% HDI
------------------------------------------------------
s(BirthYear):SexMale1 | smooth | [ 0.00, 2.24]
s(BirthYear):SexMale2 | smooth | [ 0.00, 1.95]
s(BirthYear):SexFemale1 | smooth | [ 0.00, 2.33]
s(BirthYear):SexFemale2 | smooth | [ 0.00, 1.95]
bayesfactor(ms_per_raw_gamm)
Sampling priors, please wait...
Warning: Bayes factors might not be precise.
For precise Bayes factors, sampling at least 40,000 posterior samples is recommended.
Bayes Factor (Savage-Dickey density ratio)
Parameter | BF
---------------------------
(Intercept) | 0.098
BirthYear | 0.100
SexFemale | 0.702
BirthYear:SexFemale | 0.695
* Evidence Against The Null: 0
rope(ms_per_raw_gamm,range=rope_range(ms_per_raw_gamm),ci_method='hdi',ci=0.95)
# Proportion of samples inside the ROPE [-0.18, 0.18]:
# Fixed Effects (Conditional Model)
Parameter | inside ROPE
---------------------------------
(Intercept) | 0.43 %
BirthYear | 100.00 %
SexFemale | 4.09 %
BirthYear:SexFemale | 100.00 %
# Smooth Terms
Parameter | inside ROPE
-------------------------------------
s(BirthYear):SexMale1 | 22.61 %
s(BirthYear):SexMale2 | 29.21 %
s(BirthYear):SexFemale1 | 16.19 %
s(BirthYear):SexFemale2 | 28.46 %
hdi(mv_per_raw_gamm)
Highest Density Interval
Parameter | 95% HDI
---------------------------------------
(Intercept) | [-134.37, 134.05]
BirthYear | [ -0.07, 0.07]
SexFemale | [ -7.04, 7.53]
BirthYear:SexFemale | [ -0.00, 0.00]
# Fixed effects smooth_sd
Parameter | Function | 95% HDI
------------------------------------------------------
s(BirthYear):SexMale1 | smooth | [ 0.00, 1.94]
s(BirthYear):SexMale2 | smooth | [ 0.00, 1.98]
s(BirthYear):SexFemale1 | smooth | [ 0.00, 1.99]
s(BirthYear):SexFemale2 | smooth | [ 0.00, 1.99]
bayesfactor(mv_per_raw_gamm)
Sampling priors, please wait...
Warning: Bayes factors might not be precise.
For precise Bayes factors, sampling at least 40,000 posterior samples is recommended.
Bayes Factor (Savage-Dickey density ratio)
Parameter | BF
---------------------------
(Intercept) | 0.096
BirthYear | 0.097
SexFemale | 0.710
BirthYear:SexFemale | 0.698
* Evidence Against The Null: 0
rope(mv_per_raw_gamm,range=rope_range(mv_per_raw_gamm),ci_method='hdi',ci=0.95)
# Proportion of samples inside the ROPE [-0.18, 0.18]:
# Fixed Effects (Conditional Model)
Parameter | inside ROPE
---------------------------------
(Intercept) | 0.40 %
BirthYear | 100.00 %
SexFemale | 4.22 %
BirthYear:SexFemale | 100.00 %
# Smooth Terms
Parameter | inside ROPE
-------------------------------------
s(BirthYear):SexMale1 | 26.54 %
s(BirthYear):SexMale2 | 26.59 %
s(BirthYear):SexFemale1 | 22.01 %
s(BirthYear):SexFemale2 | 26.28 %
color_scheme_set(c(voiceless_col,rep("#000000",5)))
ms_per_raw_post<-mcmc_areas(ms_per_raw,regex_pars=regex,prob_outer=0.95)+ggtitle("Voiceless")
color_scheme_set(c(voiced_col,rep("#000000",5)))
mv_per_raw_post<-mcmc_areas(mv_per_raw,regex_pars=regex,prob_outer=0.95)+ggtitle("Voiced")
grid.arrange(ms_per_raw_post,mv_per_raw_post,ncol=1)
ms_per_raw_rope<-plot(rope(ms_per_raw,range=rope_range(ms_per_raw)))+ggtitle("Voiceless")+scale_fill_manual(values=c(voiceless_col,voiceless_col_alt))
Possible multicollinearity between AgeGroupYounger:SexFemale and AgeGroupYounger (r = 0.79), Sigma[Word:SexFemale,SexFemale] and Sigma[Word:AgeGroupYounger:SexFemale,AgeGroupYounger] (r = 0.7), Sigma[Word:AgeGroupYounger:SexFemale,AgeGroupYounger:SexFemale] and Sigma[Word:SexFemale,SexFemale] (r = 0.78). This might lead to inappropriate results. See 'Details' in '?rope'.
mv_per_raw_rope<-plot(rope(mv_per_raw,range=rope_range(mv_per_raw)))+ggtitle("Voiced")+scale_fill_manual(values=c(voiced_col,voiced_col_alt))
Possible multicollinearity between AgeGroupYounger:SexFemale and AgeGroupYounger (r = 0.74). This might lead to inappropriate results. See 'Details' in '?rope'.
grid.arrange(ms_per_raw_rope,mv_per_raw_rope,ncol=1)
Voiceless Data
launch_shinystan(ms_per_raw)
Voiced Data
launch_shinystan(mv_per_raw)
save.image('../Data/all_models_final.RData')